capture program drop EpisodesFileCreator

program define EpisodesFileCreator

version 13.1
*! version 1.0 January 22, 2016

* EpisodesFileCreator [using labelfile], atrisk(typename)
*
* Purpose
*   Converts a long-format chronicle file (one row per individual, date and
*   Type) into an episodes (spell) file with one row per individual and
*   time interval and one column per Type.
*
* Inputs, all read from the current working directory
*   VarSetup.dta   one row per Type, with columns Type Duration Transition
*   Chronicle.dta  columns Id_I (or ID_I) Day Month Year DayFrac Type Value
*   using          optional label file with columns Type Value ValueLabel
*   atrisk()       name of the Type that flags whether an individual is at
*                  risk; it is renamed to AtRisk internally
*
* Output
*   Episodes_file_<date>_<time>.dta, saved in the current working directory
*
* Side effects
*   Runs clear all, so data in memory is discarded.
*   Runs set more off, permanently.
*   Installs carryforward from SSC when it is not already available.
*   Writes several intermediate .dta and .csv files to the current working
*   directory and erases them when the run completes.
*
* On a validation problem the program prints a message and stops with a
* plain exit.

* The slash makes syntax store only the file name in the local using
syntax [using/], atrisk(string)

display ""
display "Start of the program EPISODES FILE CREATOR"
* Print the ado-file banner; this must not stop the run when the program
* was defined in memory and no ado-file can be located
capture noisily which EpisodesFileCreator

quietly {
    clear all
    set more off, permanently
}

* carryforward is needed in Part 8. Install it only when it is missing, so
* that a working copy is never removed when SSC cannot be reached.
capture which carryforward
local cf_rc = _rc
if `cf_rc' != 0 {
    capture ssc install carryforward
    capture which carryforward
    local cf_rc = _rc
}
if `cf_rc' != 0 {
    display ""
    display "	The program carryforward needs to be used. There were problems installing it."
    display "	The program EPISODES FILE CREATOR was stopped."
    display "	Install carryforward and restart the EPISODES FILE CREATOR"
    exit
}

********************************************************************************
**** 		PART 1: READ AND PREPARE VARIABLE SET UP FILE
********************************************************************************
display ""
display "	Part 1: Reading and preparing the variable set up file"

* Checking that the variable setup file is found
capture use VarSetup.dta, clear
if _rc != 0 {
    display "	File VarSetup.dta not found."
    display "	The program EPISODES FILE CREATOR was stopped."
    exit
}

* Checking that the variable setup file contains the correct columns.
* Missing names are collected in a local so that no helper variables are
* created inside the data.
local missing
foreach VarName in Type Duration Transition {
    capture confirm variable `VarName'
    if _rc != 0 {
        local missing `missing' `VarName'
    }
}
if "`missing'" != "" {
    display ""
    display "	The variable setup file is not compatible."
    display "	Missing variables: `missing'"
    display "	Fix the file and restart the program."
    exit
}
display "	The variable setup file contains the correct columns. The program will continue running"

quietly {
    * Replacing name of atrisk variable in the variable setup file
    replace Type = "AtRisk" if Type == "`atrisk'"
    save VarSetup1.dta, replace

    * Transition: lookup of Type to Transition
    keep Type Transition
    save TypeTransition.dta, replace

    * Duration and Minus1: both lists hold the Types with Continuous
    * duration whose Transition is not End
    use VarSetup1.dta, clear
    keep if Duration == "Continuous" & Transition != "End"
    keep Type
    save TypeDuration.dta, replace
    save TypeReplaceMin1.dta, replace
}

* Labels (optional)
local haslabels = 0
if `"`using'"' != "" {
    capture use `"`using'"', clear
    if _rc != 0 {
        display ""
        display "	The file of labels could not be read."
        display "	Fix the file and restart the program."
        exit
    }

    * Checking that the label file contains the correct columns
    local missing
    foreach VarName in Type Value ValueLabel {
        capture confirm variable `VarName'
        if _rc != 0 {
            local missing `missing' `VarName'
        }
    }
    if "`missing'" != "" {
        display ""
        display "	The file of labels is not compatible."
        display "	Missing fields: `missing'"
        display "	Fix the file and restart the program."
        exit
    }

    quietly {
        * Renamed so that it does not clash with the ValueLabel column
        * created in the setup data before the merge below
        rename ValueLabel vallab
        tempfile vlab
        save `vlab', replace
    }
    local haslabels = 1
}

quietly {
    use VarSetup1.dta, clear
    gen ValueLabel = ""
    if `haslabels' {
        * Best effort: a failed merge simply leaves all labels empty
        capture merge 1:m Type using `vlab'
        capture drop if _merge == 2
        capture drop _merge
        capture replace ValueLabel = vallab
    }
    * Without a usable label file there is no Value column; create an empty
    * one so that an empty label table is produced instead of an error
    capture confirm variable Value
    if _rc != 0 {
        gen Value = ""
    }
    keep if ValueLabel != ""
    keep Type Value ValueLabel

    capture {
        * Drop, one row at a time, every label row whose Value cannot be
        * converted to a number by destring
        gen maxrow = _N
        gen row = _n
        gen i = 1
        while i <= maxrow & _N != 0 {
            gen temp = Value if i == row
            destring temp, replace
            capture confirm string variable temp
            if !_rc {
                drop if i == row
            }
            replace i = i + 1
            drop temp
        }
        drop maxrow row i
        destring Value, replace
        * Blanks are replaced because the label table is later written to
        * a space-delimited file. Re-creating the variable also moves
        * ValueLabel to the last column.
        gen temp = subinstr(ValueLabel, " ", "_", .)
        drop ValueLabel
        rename temp ValueLabel
    }
    * Part 8 reads the columns by position: Type, Value, ValueLabel
    order Type Value ValueLabel

    save ValueLabel.dta, replace
}

********************************************************************************
**** 		PART 2: READ AND PREPARE THE CHRONICLE FILE
********************************************************************************
display ""
display "	Part 2: Reading and preparing the chronicle file"

capture use Chronicle.dta, clear
if _rc != 0 {
    display "	File Chronicle.dta not found."
    display "	The program EPISODES FILE CREATOR was stopped."
    exit
}

* Accept both spellings of the identifier column
capture rename ID_I Id_I

* Checking that the chronicle file contains the correct columns
local missing
foreach VarName in Id_I Day Month Year DayFrac Type Value {
    capture confirm variable `VarName'
    if _rc != 0 {
        local missing `missing' `VarName'
    }
}
if "`missing'" != "" {
    display ""
    display "	The chronicle file is not compatible."
    display "	Missing variables: `missing'"
    display "	Fix the file and restart the program."
    exit
}
display "	The chronicle file contains the correct columns. The program will continue running"

quietly {
    * Generating DateFormat for Types which have no Value but the Timestamp
    * is their value: a Type is listed when every one of its rows has an
    * empty Value
    use Chronicle.dta, clear
    merge m:1 Type using TypeTransition.dta, nogen norep
    capture gen emptyType = (Value == "")
    keep if Transition != "End" & Type != "AtRisk"
    collapse (max) maxempty=emptyType (min) minempty=emptyType, by(Type)
    capture keep if maxempty == 1 & minempty == 1
    capture keep Type
    capture duplicates drop
    capture gen DateFormat = "YMD"
    save TypeDateFormat.dta, replace
}

quietly {
    * Replacing name of atrisk variable in the chronicle file
    use Chronicle.dta, clear
    capture rename ID_I Id_I
    replace Type = "AtRisk" if Type == "`atrisk'"
    save "ExtractionFile.dta", replace

    count if Type == "AtRisk"
    local nrows = r(N)
}

if `nrows' == 0 {
    display ""
    display "	At risk variable `atrisk' not found in the chronicle file."
    display "	Fix the file and restart the program."
    exit
}

quietly {
    use "ExtractionFile.dta", clear

    merge m:1 Type using TypeTransition.dta, nogen norep
    * The at-risk Type always opens a spell
    replace Transition = "Start" if Type == "AtRisk"

    foreach valueVar of varlist Month Day Year {
        destring `valueVar', replace
    }

    * Assigning values to Types which have no Value but the Timestamp is
    * their value
    egen temp = concat(Year Month Day), punct(-)
    replace Value = temp if Value == "" & Transition != "End"
    drop temp

    gen ChangeDate = mdy(Month, Day, Year)
    drop Month Day Year
    format ChangeDate %td

    replace DayFrac = 0 if DayFrac == .

    * Change date for individuals with only one date in the database, who
    * have at least one date collision, and who have an event occurring on
    * this date. numDate is the number of distinct non-missing dates of the
    * individual.
    gen dtype = (ChangeDate == .)
    bysort Id_I ChangeDate dtype: gen temp = _n
    bysort Id_I ChangeDate dtype: gen temp1 = (temp == 1 & ChangeDate != .)
    bysort Id_I temp1: gen temp2 = _n if temp1 == 1
    bysort Id_I: egen numDate = max(temp2)
    drop temp temp1 temp2 dtype

    * The tempfile macros are referenced by their plain names
    tempfile DayFracOneDate
    save `DayFracOneDate', replace

    * Largest DayFrac per individual and date among one-date individuals;
    * it is matched back onto their End rows only
    keep if numDate == 1
    drop if ChangeDate == .
    collapse (max) DayFrac1=DayFrac, by(Id_I ChangeDate)
    gen Transition = "End"
    tempfile DayFracOneDate1
    save `DayFracOneDate1', replace

    use `DayFracOneDate', clear
    drop numDate
    merge m:1 Id_I ChangeDate Transition using `DayFracOneDate1'
    drop if _merge == 2
    drop _merge
    replace DayFrac1 = 0 if DayFrac1 == .
    replace DayFrac = DayFrac1 if DayFrac == 0 & DayFrac1 != 0
    drop DayFrac1

    * Add date fraction to fix date collisions
    replace DayFrac = 0 if DayFrac == .
    replace ChangeDate = ChangeDate + DayFrac
    drop DayFrac

    compress
    save "ExtractionFile.dta", replace
}

* Every combination of individual, date and Type must occur only once
quietly {
    use "ExtractionFile.dta", clear
    bysort Id_I ChangeDate Type: gen checkRowOrder = _n
    summarize checkRowOrder, meanonly
    local maxRowOrder = r(max)
}

if `maxRowOrder' == 1 {
    display "	No date collisions found. The program will continue running"
}
else {
    quietly bysort Type: egen maxType = max(checkRowOrder)

    disp ""
    disp "	Errors found in the data"
    disp "	The variable types which do not have unique rows per each individual and date are:"
    tab Type if maxType > 1
    disp ""
    disp "	These problems must be fixed in the source dataset, after which the program"
    disp "	must be restarted, giving in input the corrected chronicle file."

    exit
}

********************************************************************************
**** 						PART 3 : CHECK TYPES
********************************************************************************
display ""
display "	Part 3: Checking that the chronicle and variable setup files contain the same types."

quietly {
    use Chronicle.dta, clear
    keep Type
    duplicates drop
    save Types_extraction.dta, replace

    use VarSetup.dta, clear
    keep Type
    duplicates drop
    save Types_varSetup.dta, replace

    use Types_extraction.dta, clear
    merge 1:1 Type using Types_varSetup.dta

    * Count the unmatched Types on each side. Comparing only the number of
    * Types would miss files with equally many but different Types.
    count if _merge == 1
    local onlyChronicle = r(N)
    count if _merge == 2
    local onlySetup = r(N)
}

if `onlyChronicle' == 0 & `onlySetup' == 0 {
    display "	The chronicle and variable setup files contain the same types."
    display "	The program will continue running."
}
else {
    if `onlyChronicle' > 0 {
        display "	The following types contained in the chronicle file are not "
        display "	found in the variable setup file:"
        list Type if _merge == 1
    }
    if `onlySetup' > 0 {
        display "	The following types contained in the variable setup file are not "
        display "	found in the chronicle file:"
        list Type if _merge == 2
    }
    disp "	These problems must be fixed in the source dataset, after which the program"
    disp "	must be restarted, giving in input the corrected chronicle file."
    exit
}

********************************************************************************
**** 						PART 4 : TIME-VARYING COVARIATES
**** This part creates a wide file containing one column for each type of
**** covariate that changes value at the beginning of a spell (Transition=Start).
********************************************************************************
display ""
display "	Part 4: Rectangularisation of time-varying variables (this step may take a long time)"

quietly {
    use "ExtractionFile.dta", clear

    keep if Transition == "Start" | Transition == "start" | Transition == "START"
    drop Transition

    * No matching rows: build a placeholder column, removed again in Part 7.
    * ChangeDate is kept because it is a merge key below.
    if _N == 0 {
        use "ExtractionFile.dta", clear
        keep Id_I ChangeDate
        duplicates drop
        gen Type = "EmptyVar1"
        gen Value = "EmptyVal1"
    }

    * Number the Types so that they can be looped over
    encode Type, generate(Type_num)
    summarize Type_num, meanonly
    local j = r(max)

    tempfile all
    save `all'

    * One row per individual and date forms the frame of the wide file
    keep Id_I ChangeDate
    capture {
        bysort Id_I ChangeDate: gen row = _n
        drop if row != 1
        drop row
    }
    tempfile dates
    save `dates'

    * One file per Type, with Value renamed to Value followed by the Type
    forvalues i = 1/`j' {
        use `all', clear
        keep if Type_num == `i'
        drop Type_num
        local varName = Type
        rename Value Value`varName'
        drop Type
        tempfile var`i'
        save `var`i''
    }

    use `dates', clear
    forvalues i = 1/`j' {
        merge 1:1 Id_I ChangeDate using `var`i'', nogenerate noreport
    }

    rename ChangeDate date1
    sort Id_I date1
    compress

    save "Covariates_time_varying.dta", replace
}

********************************************************************************
****					PART 5: TIME-INVARIANT COVARIATES
**** This part of the program creates a wide file containing one column for each
**** type of time-fixed covariate (Transition=Invariant)
********************************************************************************
display ""
display "	Part 5: Rectangularisation of time-invariant variables"

quietly {
    use "ExtractionFile.dta", clear

    keep if Transition == "Invariant" | Transition == "invariant" | Transition == "INVARIANT"
    drop Transition ChangeDate

    * No matching rows: build a placeholder column, removed again in Part 7
    if _N == 0 {
        use "ExtractionFile.dta", clear
        keep Id_I
        duplicates drop
        gen Type = "EmptyVar2"
        gen Value = "EmptyVal2"
    }

    encode Type, generate(Type_num)
    summarize Type_num, meanonly
    local j = r(max)

    tempfile all
    save `all'

    * One row per individual forms the frame of the wide file
    keep Id_I
    capture {
        bysort Id_I: gen row = _n
        drop if row != 1
        drop row
    }
    tempfile dates
    save `dates'

    forvalues i = 1/`j' {
        use `all', clear
        keep if Type_num == `i'
        drop Type_num
        local varName = Type
        rename Value Value`varName'
        drop Type
        tempfile var`i'
        save `var`i''
    }

    use `dates', clear
    forvalues i = 1/`j' {
        merge 1:1 Id_I using `var`i'', nogenerate noreport
    }

    save "Covariates_time_invariant.dta", replace
}

********************************************************************************
**** 						PART 6: EVENTS
**** This part of the program creates a wide file containing one column for each
**** type of event occurring at the end of a spell (Transition=End).
********************************************************************************
display ""
display "	Part 6: Rectangularisation of events"

quietly {
    use "ExtractionFile.dta", clear

    keep if Transition == "End" | Transition == "END" | Transition == "end"
    drop Transition

    * No End events: build a placeholder column, removed again in Part 7.
    * ChangeDate is kept because it is a merge key below.
    if _N == 0 {
        use "ExtractionFile.dta", clear
        keep Id_I ChangeDate
        duplicates drop
        gen Type = "EmptyVar0"
        gen Value = "EmptyVal0"
    }

    * Replacing value to 1 if Value is missing
    replace Value = "1" if Value == ""

    encode Type, generate(Type_num)
    summarize Type_num, meanonly
    local j = r(max)

    tempfile all
    save `all'

    keep Id_I ChangeDate
    capture {
        bysort Id_I ChangeDate: gen row = _n
        drop if row != 1
        drop row
    }
    tempfile dates
    save `dates'

    forvalues i = 1/`j' {
        use `all', clear
        keep if Type_num == `i'
        drop Type_num
        local varName = Type
        rename Value Value`varName'
        drop Type
        tempfile var`i'
        save `var`i''
    }

    use `dates', clear
    forvalues i = 1/`j' {
        merge 1:1 Id_I ChangeDate using `var`i'', nogenerate noreport
    }

    rename ChangeDate date2
    sort Id_I date2
    save "Events_end_dates.dta", replace
}

********************************************************************************
****							PART 7: SPELLS CONSTRUCTION
****  This part of the program constructs spells and merges start date and
****  time-fixed covariates and end-date events.
********************************************************************************
display ""
display "	Part 7: Construction of spells"

quietly {
    use "ExtractionFile.dta", clear

    drop if ChangeDate == .

    sort Id_I ChangeDate Transition
    keep Id_I ChangeDate Transition
    duplicates drop Id_I ChangeDate Transition, force

    * Individuals with exactly two rows keep both rows even when the two
    * rows share a date; for everyone else the dates are de-duplicated
    bysort Id_I: gen numRows = _N
    gen rowType = "-1" if numRows != 2
    replace rowType = Transition if numRows == 2

    keep Id_I ChangeDate rowType
    duplicates drop Id_I ChangeDate rowType, force
    drop rowType

    * Spells: each date opens a spell that ends at the next date of the
    * same individual
    sort Id_I ChangeDate
    gen date1 = ChangeDate
    gen date2 = ChangeDate[_n+1] if Id_I == Id_I[_n+1]
    format date1 date2 %td
    drop ChangeDate
    order date1 date2, after(Id_I)

    sort Id_I date1 date2
    * The last date of an individual does not open a spell
    drop if date2 == .

    * Merge time-varying covariates
    sort Id_I date1
    merge 1:1 Id_I date1 using "Covariates_time_varying.dta"
    drop if _merge == 2
    drop _merge

    * Merge time-invariant covariates
    sort Id_I
    merge m:1 Id_I using "Covariates_time_invariant.dta"
    drop if _merge == 2
    drop _merge

    * Merge events on end dates
    sort Id_I date2
    merge 1:1 Id_I date2 using "Events_end_dates.dta"
    drop if _merge == 2
    drop _merge

    * Strip the Value prefix so that each column carries its Type name
    renpfix Value

    * Remove the placeholder columns created when a part had no rows
    capture drop EmptyVar0
    capture drop EmptyVar1
    capture drop EmptyVar2

    erase "ExtractionFile.dta"
    erase "Covariates_time_varying.dta"
    erase "Events_end_dates.dta"
    erase "Covariates_time_invariant.dta"

    save "PreEpisodes_file.dta", replace
}

********************************************************************************
****			 PART 8: FORMATTING OF THE EPISODES FILE
**** The purpose of this part of the program is to convert variable formats and
**** fill down missing information.
********************************************************************************
display ""
display "	Part 8: Formatting of the episodes file (based on the variable setup file)"

quietly {
    * The lookup tables are written as space-delimited text files and read
    * back line by line further down
    use TypeDateFormat.dta, clear
    export delimited using TypeDateFormat.csv, delim(" ") novarnames replace

    use TypeDuration.dta, clear
    export delimited using TypeDuration.csv, delim(" ") novarnames replace

    * Types whose values are dates are excluded from the -1 replacement
    use TypeReplaceMin1.dta, clear
    merge 1:1 Type using TypeDateFormat.dta
    keep if _merge == 1
    drop _merge
    export delimited using TypeReplaceMin1.csv, delim(" ") novarnames replace

    use TypeTransition.dta, clear
    keep if Transition == "End"
    keep Type
    export delimited using TypeTransitionEv.csv, delim(" ") novarnames replace

    * First label row of each Type: used to define the value label
    use ValueLabel.dta, clear
    capture {
        bysort Type (Value): gen row = _n
        keep if row == 1
        drop row
    }
    export delimited using ValueLabelTypes.csv, delim(" ") novarnames replace

    * Remaining label rows: added to the value label afterwards
    use ValueLabel.dta, clear
    capture {
        bysort Type (Value): gen row = _n
        keep if row > 1
        drop row
    }
    export delimited using ValueLabelValues.csv, delim(" ") novarnames replace

    use "PreEpisodes_file.dta", clear

    * Fill down variables where succeeding rows have missing values.
    capture file close Duration
    file open Duration using TypeDuration.csv, read
    file read Duration line
    while r(eof) == 0 {
        local varName = word("`line'", 1)
        bysort Id_I (date1): carryforward `varName', replace
        file read Duration line
    }
    file close Duration

    * Replace NoValue with empty cells for cases where there was no value
    * of a context variable on the date of entry of an individual into the
    * context. The replace fails on numeric variables, hence the capture.
    foreach valueVar of varlist * {
        capture replace `valueVar' = "" if `valueVar' == "NoValue"
    }

    * Reformat variables: convert the listed columns from text to dates
    capture file close DateFormat
    file open DateFormat using TypeDateFormat.csv, read
    file read DateFormat line
    while r(eof) == 0 {
        local varName = word("`line'", 1)
        local dateType = word("`line'", 2)

        rename `varName' temp
        gen `varName' = date(temp, "`dateType'")
        drop temp
        format `varName' %td

        file read DateFormat line
    }
    file close DateFormat

    destring _all, replace

    * Replace missing values by -1
    capture file close ReplaceMin1
    file open ReplaceMin1 using TypeReplaceMin1.csv, read
    file read ReplaceMin1 line
    while r(eof) == 0 {
        local varName = word("`line'", 1)
        capture confirm numeric variable `varName'
        if !_rc {
            replace `varName' = -1 if `varName' == .
        }
        file read ReplaceMin1 line
    }
    file close ReplaceMin1

    * Add labels: each line holds Type, Value and ValueLabel, and the value
    * label of a Type is named after the Type with the letter l appended
    capture file close TypeLabel
    file open TypeLabel using ValueLabelTypes.csv, read
    file read TypeLabel line
    while r(eof) == 0 {
        local varName = word("`line'", 1)
        local labName = "`varName'l"
        local valName = word("`line'", 2)
        local labVal = word("`line'", 3)
        label define `labName' `valName' "`labVal'"
        file read TypeLabel line
    }
    file close TypeLabel

    capture file close TypeValues
    file open TypeValues using ValueLabelValues.csv, read
    file read TypeValues line
    while r(eof) == 0 {
        local varName = word("`line'", 1)
        local labName = "`varName'l"
        local valName = word("`line'", 2)
        local labVal = word("`line'", 3)
        label define `labName' `valName' "`labVal'", add
        file read TypeValues line
    }
    file close TypeValues

    * Attach each value label to its variable
    file open TypeLabel using ValueLabelTypes.csv, read
    file read TypeLabel line
    while r(eof) == 0 {
        local varName = word("`line'", 1)
        local labName = "`varName'l"
        label values `varName' `labName'
        file read TypeLabel line
    }
    file close TypeLabel
}

********************************************************************************
****			 PART 9: DROPPING SPELLS WHEN THE INDIVIDUAL IS NOT AT RISK
****			All rows when the variable AtRisk is not 1 are dropped.
********************************************************************************
display ""
display "	Part 9: Dropping spells when the individual is not at risk"

quietly {
    keep if AtRisk == 1
    drop AtRisk
}

quietly {
    * Build a file-name suffix from the current date and time, replacing
    * colons and blanks by underscores
    local time_string = subinstr(ltrim(subinstr(c(current_date)+"_" +c(current_time), ":", "_", .)), " ", "_", .)
    local fileName = "Episodes_file_`time_string'.dta"

    save "`fileName'", replace

    * Erase temporary files
    erase "PreEpisodes_file.dta"
    erase TypeTransition.dta
    erase TypeDateFormat.dta
    erase TypeDuration.dta
    erase TypeReplaceMin1.dta
    erase TypeDateFormat.csv
    erase TypeDuration.csv
    erase TypeReplaceMin1.csv
    erase ValueLabelTypes.csv
    erase ValueLabelValues.csv
    erase ValueLabel.dta
    erase TypeTransitionEv.csv
    erase VarSetup1.dta
    erase Types_extraction.dta
    erase Types_varSetup.dta
}

display ""
display "End of the program EPISODES FILE CREATOR"
display ""
display "Name of episodes file: " "`fileName'"

end











